//# define PATCHSIZE 8
//# define PATCH_OFFSET 7
//# define GRP_W 16
//# define GRP_H 8
//
//__kernel __attribute__((reqd_work_group_size(GRP_W, GRP_H, 1)))
//void decompose(
//                  global float*     input,
//                  global float*     output,
//                  const int  width,
//                  const int  height
//                  )
//{
//    const int g_x = get_global_id(0);
//    const int g_y = get_global_id(1);
//    const int l_x = get_local_id(0);
//    const int l_y = get_local_id(1);
//    const bool boundsCheck = g_x < width - PATCH_OFFSET && g_y < height - PATCH_OFFSET;
//    const int inputOffet = g_y * width + g_x;
//    const int outputOffet = inputOffet * PATCHSIZE * PATCHSIZE;
//    
//    __local float s_mem[GRP_H + PATCH_OFFSET][GRP_W + PATCH_OFFSET];
//    
//    if (boundsCheck) {
//        const bool needsRightLoad = l_x + PATCH_OFFSET > GRP_W - 1;
//        const bool needsBottomLoad = l_y + PATCH_OFFSET > GRP_H - 1;
//        
//        s_mem[l_y][l_x] = input[inputOffet];
//        
//        if (needsRightLoad) {
//            s_mem[l_y][l_x + PATCH_OFFSET] = input[inputOffet + PATCH_OFFSET];
//        }
//        
//        if (needsBottomLoad) {
//            s_mem[l_y + PATCH_OFFSET][l_x] = input[inputOffet + width * PATCH_OFFSET];
//        }
//        
//        if (needsRightLoad && needsBottomLoad) {
//            s_mem[l_y + PATCH_OFFSET][l_x + PATCH_OFFSET] = input[inputOffet + width * PATCH_OFFSET + PATCH_OFFSET];
//        }
//    }
//    
//    barrier(CLK_LOCAL_MEM_FENCE);
//    
//    if (boundsCheck) {
//        
//        float8 res[PATCHSIZE];
//        
//#pragma unroll
//        for (int j = 0; j < PATCHSIZE; ++j) {
//            __local float *D = s_mem[j + l_y] + l_x;
//            
//            res[j].s0 = D[0];
//            res[j].s1 = D[1];
//            res[j].s2 = D[2];
//            res[j].s3 = D[3];
//            
//            res[j].s4 = D[4];
//            res[j].s5 = D[5];
//            res[j].s6 = D[6];
//            res[j].s7 = D[7];
//        }
//        
//#pragma unroll
//        for (int i = 0; i < PATCHSIZE; ++i) {
//            __global float *dst = output + outputOffet + PATCHSIZE * i;
//#pragma unroll
//            for (int j = 0; j < PATCHSIZE; ++j, ++dst) {
//                *dst = res[i][j];
//            }
//        }
//    }
//}

// Work-group dimensions. One work-group copies one WGS_W x WGS_H patch, so
// these also define the patch size and the per-patch stride in `out`.
#define WGS_W 8
#define WGS_H 8

/*
 * Copies WGS_W x WGS_H patches of `img` into `out`, one patch per work-group.
 *
 * The patch origin (i, j) is the global id divided by the work-group size, so
 * all work-items of a group share the same origin (assuming the global offset,
 * if any, is a multiple of the group size). Each work-item copies the single
 * pixel at (i + local_i, j + local_j).
 *
 * Output layout: the patch with origin (i, j) occupies the WGS_W * WGS_H
 * consecutive floats starting at (j * width + i) * WGS_W * WGS_H, stored
 * row by row.
 *
 * img    - source image, read with a row stride of `width`
 * out    - destination buffer for the patches (layout above)
 * width  - image width in pixels
 * height - image height in pixels
 *
 * Work-groups whose patch would extend past the right or bottom image edge
 * write nothing; their slots in `out` are left untouched.
 */
__kernel __attribute__((reqd_work_group_size(WGS_W, WGS_H, 1)))
void decompose(
               __global float*      img,
               __global float*      out,
               int                  width,
               int                  height
               )
{
    // Patch origin: identical for every work-item in the same work-group.
    const int i = (int)(get_global_id(0) / WGS_W);
    const int j = (int)(get_global_id(1) / WGS_H);

    // Skip patches that do not fit entirely inside the image; without this
    // guard the read below would run past the row end or the buffer end.
    // The condition is uniform across a work-group, so either all of its
    // work-items return or none do.
    if (i > width - WGS_W || j > height - WGS_H) {
        return;
    }

    // Position of this work-item inside the patch.
    const int local_i = get_local_id(0);
    const int local_j = get_local_id(1);
    const int local_idx = local_j * WGS_W + local_i;

    // Start of this patch in `out`, plus the row-major position inside it.
    const int offsetDst = (j * width + i) * (WGS_W * WGS_H) + local_idx;

    // Source pixel covered by this work-item.
    const int x = i + local_i;
    const int y = j + local_j;

    out[offsetDst] = img[y * width + x];
}
